In [1]:
import os, sys
import graphlab as gl
import graphlab.aggregate as agg
from tqdm import tqdm_notebook as tqdm
# set canvas path
# gl.canvas.set_target('ipynb')
%matplotlib inline
import matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
# Load the Amazon baby-product reviews from the SFrame directory under data/.
products = gl.SFrame('data/amazon_baby.gl/')
In [4]:
# Preview the first rows of the freshly loaded review data.
products.head()
Out[4]:
In [6]:
# Add a 'word_count' column computed by count_words from each review's text;
# this column is the only feature the classifier below is trained on.
products['word_count'] = gl.text_analytics.count_words(products['review'])
In [7]:
# Preview again to check the new 'word_count' column.
products.head()
Out[7]:
In [8]:
# Pull out every review of one product, matched on its exact name.
# This slice is taken after 'word_count' is added, so it carries that column.
giraffe_reviews = products[products['name'] == 'Vulli Sophie the Giraffe Teether']
In [9]:
# Number of reviews for this product.
len(giraffe_reviews)
Out[9]:
In [11]:
# Plot how many reviews there are for each star rating (before any filtering).
# Only the 'rating' column is converted to pandas: the plot reads nothing else,
# and converting the full SFrame would also copy the review text and the
# per-row 'word_count' values for every row.
sns.countplot(x="rating", data=products[['rating']].to_dataframe())
Out[11]:
In [14]:
# Ignore all 3-star reviews: they are neither positive (4-5 stars) nor negative (1-2 stars).
In [15]:
# Keep only the rows whose rating is not 3.
# Note: this rebinds `products`; `giraffe_reviews` was sliced before this filter,
# so it still contains any 3-star reviews of that product.
products = products[products['rating'] != 3]
In [16]:
# Row count after dropping the 3-star reviews.
len(products)
Out[16]:
In [17]:
# Positive sentiment is a 4- or 5-star rating; negative sentiment is a 1- or 2-star rating.
In [18]:
# Binary target: true for ratings of 4 or more, false otherwise.
# With the 3-star rows already removed, "otherwise" means ratings below 3.
products['sentiment'] = products['rating'] >= 4
In [19]:
# Preview to check the new 'sentiment' column.
products.head()
Out[19]:
In [20]:
# Randomly split the labelled reviews into two sets using a 0.8 fraction
# (presumably ~80% train / ~20% test — confirm against random_split's docs).
# The fixed seed keeps the split the same across runs.
train_data, test_data = products.random_split(0.8, seed=0)
In [21]:
# Fit a logistic classifier that predicts 'sentiment' from the 'word_count'
# feature alone.
# NOTE(review): test_data doubles as the validation set here, so it is not a
# fully untouched hold-out when it is evaluated later — confirm this is intended.
sentiment_model = gl.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data,
)
In [26]:
# Evaluate the model on test_data, requesting the ROC-curve metric.
# NOTE(review): test_data was also passed as validation_set when the model was created.
sentiment_model.evaluate(test_data, metric='roc_curve')
Out[26]:
In [31]:
# Score every giraffe review with the trained model and store the result in a
# new 'predicted_sentiment' column.
# NOTE(review): with output_type='probability' this is presumably the
# probability of the positive class — confirm against the predict() docs.
giraffe_reviews['predicted_sentiment'] = sentiment_model.predict(
    giraffe_reviews,
    output_type='probability',
)
In [32]:
# Preview to check the new 'predicted_sentiment' column.
giraffe_reviews.head()
Out[32]:
In [33]:
# Order the giraffe reviews from highest to lowest predicted_sentiment.
giraffe_reviews = giraffe_reviews.sort('predicted_sentiment', ascending=False)
In [34]:
# After the descending sort, the first rows are the reviews with the highest
# predicted_sentiment.
giraffe_reviews.head()
Out[34]:
In [35]:
# Review text with the highest predicted_sentiment (first row after the descending sort).
giraffe_reviews[0]['review']
Out[35]:
In [36]:
# Review text with the second-highest predicted_sentiment.
giraffe_reviews[1]['review']
Out[36]:
In [37]:
# Review text with the lowest predicted_sentiment (last row after the descending sort).
giraffe_reviews[-1]['review']
Out[37]:
In [38]:
# Review text with the second-lowest predicted_sentiment.
giraffe_reviews[-2]['review']
Out[38]: